rm(list=ls(all=TRUE))
library(RMySQL)
library(quanteda)
library(stm)

try(setwd("~/Dropbox/Columbia/History Lab/nonoecd"), TRUE)

##### Load and clean data #####

tr <- read.csv("trade_cables.csv")
tr<-subset(tr,tr$body!=""&!grepl("^MRN",tr$body))
tr$prom<-tr$tag==53|tr$tag==56|tr$tag==58|tr$tag==1980 # identify trade-related tags
tr$promo <- ifelse(tr$prom=="TRUE",1,0)

tr$duplicate <- duplicated(tr$id) # identify duplicate observations
tr <- tr[tr$duplicate==F,] # remove duplicate observations

# Add more descriptive trade promotion indicator
tr$type <- ifelse(tr$prom==T,"prom","trade")

# Create corpus and remove stopwords/common stems
trcorp<-corpus(tr, docid_field = 'id',text_field = 'body')
trdfm<-dfm(trcorp,remove_numbers=TRUE, remove_punct = TRUE, remove=stopwords('english'), stem=T)
trdfm<-dfm_select(trdfm,min_nchar=3)
rem.list.s<-c('embassi','u.s','trade','also','may','new','following','one','time','two','made','howev','can','possible',
              'said','amembassi','end','now','since','pleas','well','united','might','can')
trdfm<-dfm_select(trdfm,selection="remove",pattern=rem.list.s)

##### Topic modeling in prom vs. trade documents #####

# Set exclusion thresholds
l = nrow(tr)
lt = round(l*.025)
ut = round(l*.4)

# Prepare documents for STM
processed<-convert(trdfm, to="stm")
out<-prepDocuments(processed$documents,processed$vocab, processed$meta, lower.thresh = lt, upper.thresh=ut)

# Find number of topics
searchK <- searchK(out$documents, out$vocab, K = 6:16, data = out$meta, verbose = F) # model appears to converge around 10 topics 

# Estimate topic model
model <- stm(out$documents, out$vocab, K=10, 
             prevalence = ~ promo, max.em.its = 300,
             data = out$meta, seed = 1234,
             verbose = F)
model1.effect <-  estimateEffect(1:10 ~ promo, model, meta=out$meta, uncertainty = "Global")
summary(model1.effect)

# Plot topic model results -- Figure A1-1
plot(model1.effect, covariate = "promo", 
     #topics = c(1:10), 
     method = "difference", verbose.labels = F,
     cov.value1=TRUE, cov.value2=FALSE, xlab = "Change in topical prevalence from trade to promotion", main = "Topics in Trade vs. Promotion Cables", xlim = c(-.3,.3), 
     #labeltype = "custom", custom.labels = c('Urgency of Action','Just Future')
)

# Output topic lables -- Figure A1-2
plot(model, type = "labels", text.cex=.6,labeltype = "frex", n=10)
labelTopics(model)
